﻿using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using log4net;
using HtmlAgilityPack;
using DWH.Data;
using DWH.Utils;

namespace DWH.PP
{
    /// <summary>
    /// Base class for tasks that pull content pages from a board: it loads a page,
    /// locates a single target element by tag/attribute and hands its HTML to the
    /// database helper. Derived classes supply the page-list traversal.
    /// </summary>
    public abstract class PPBoardPullTaskBase : PPTaskBase
    {
        /// <summary>Name of the text encoding used to decode downloaded pages.</summary>
        protected string pageEncoding;

        /// <summary>
        /// Pattern identifying content page URLs, e.g. <c>htm_data/14/1207/[\d+].html</c>.
        /// It is only loaded here; how it is matched is up to derived classes.
        /// </summary>
        protected string contentPageUrlPattern;

        /// <summary>XPath expression selecting the candidate content elements.</summary>
        protected string contentTargetElement;

        /// <summary>Name of the attribute a candidate element must carry.</summary>
        protected string contentTargetElementAttributeName;

        /// <summary>Pattern the attribute value must match (via <c>DWHHelper.PatternMatch</c>).</summary>
        protected string contentTargetElementAttributeValue;

        // Assigned in Initialize(); ProcessPageContent must not be called before it.
        private ILog logger;

        /// <summary>
        /// Creates the logger, loads the task information from the database and
        /// copies the page-related settings out of <c>ExtendData</c>.
        /// </summary>
        /// <remarks>
        /// NOTE(review): behavior for a missing <c>ExtendData</c> key depends on the
        /// type of <c>ExtendData</c>, which is declared outside this file.
        /// </remarks>
        public override void Initialize()
        {
            logger = LogManager.GetLogger(typeof(PPBoardPullTaskBase));

            InitializeTaskInfoFromDB();

            pageEncoding = ExtendData[DWHConstants.KEY_PAGE_ENCODING];

            contentPageUrlPattern = ExtendData[DWHConstants.KEY_CONTENT_PAGE_URL_PATTERN];

            contentTargetElement = ExtendData[DWHConstants.KEY_CONTENT_TARGET_ELEMENT];
            contentTargetElementAttributeName = ExtendData[DWHConstants.KEY_CONTENT_TARGET_ELEMENT_ATTRIBUTE_NAME];
            contentTargetElementAttributeValue = ExtendData[DWHConstants.KEY_CONTENT_TARGET_ELEMENT_ATTRIBUTE_VALUE];
        }

        /// <summary>Processes one page-list (index) page; implemented by derived classes.</summary>
        /// <param name="currentPageListUrl">URL of the page list to process.</param>
        public abstract void ProcessPageList(string currentPageListUrl);

        /*
         Sample of the kind of markup a content target element may contain
         (abridged; repeated <img> tags elided, footer labels translated):

         <table width='99%' height="100%" align=center cellspacing=0 cellpadding=6 style='TABLE-LAYOUT: fixed;WORD-WRAP: break-word'>
         <tr height="100%"><td bgcolor='#FFFFEE' colspan=2 valign=top>
         <hr color='#FFFFEE' size=1>
         <span class='tpc_title'>xxxxxxxxxxxxxxx[10P]</span><br>
         <br><br> <img src='http://www.ycompletiones.com/pic2/a37371ea89j/zjs8903s/1.jpg' border=0 onclick="window.open('http://www2.lookpipe.com/get.php?filepath=http://www.ycompletiones.com/pic2/a37371ea89j/zjs8903s/1.jpg');" ><br>
         ... (images 2.jpg to 10.jpg follow the same form) ...
         <b></b><br></span><br>
         </td></tr>
         <tr valign=bottom bgcolor='#FFFFEE'>
         <td colspan=2>
         </td></tr>
         <tr bgcolor='#FFFFEE' valign=bottom><td width=90%>
         <font color=red>[original poster]</font>
         <b>Posted:</b>2012-07-16 04:57|</td>
         <td align=right>
         <a href="javascript:scroll(0,0)">top</a>
         </td></tr></table></td></tr></table>
         */

        /// <summary>
        /// Downloads <paramref name="pageUrl"/>, finds the first element selected by
        /// <c>contentTargetElement</c> whose configured attribute matches the configured
        /// pattern, and passes its outer HTML to <c>DWHDBHelper</c>.
        /// </summary>
        /// <param name="pageUrl">Absolute URL of the content page to load.</param>
        /// <remarks>
        /// Only the first matching element is processed. If the page has no matching
        /// element nothing is stored.
        /// </remarks>
        public virtual void ProcessPageContent(string pageUrl)
        {
            logger.InfoFormat("processing ProcessPageContent for page:{0}", pageUrl);

            HtmlWeb web = new HtmlWeb();
            web.OverrideEncoding = Encoding.GetEncoding(pageEncoding);

            HtmlDocument doc = web.Load(pageUrl);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            string pageTitle = null;
            HtmlNodeCollection titles = doc.DocumentNode.SelectNodes("//title");
            if (titles != null && titles.Count > 0)
            {
                pageTitle = titles[0].InnerText;
            }

            HtmlNodeCollection candidates = doc.DocumentNode.SelectNodes(contentTargetElement);
            if (candidates == null)
            {
                logger.WarnFormat("no '{0}' elements found for page:{1}", contentTargetElement, pageUrl);
                return;
            }

            foreach (HtmlNode element in candidates)
            {
                HtmlAttribute attribute = element.Attributes[contentTargetElementAttributeName];
                if (attribute == null
                    || !DWHHelper.PatternMatch(attribute.Value, contentTargetElementAttributeValue))
                {
                    continue;
                }

                // NOTE(review): String.GetHashCode is not guaranteed to be stable across
                // runtimes/processes; kept because existing rows are keyed by it.
                int pageKey = pageUrl.GetHashCode();
                string pageContent = element.OuterHtml;

                if (!String.IsNullOrEmpty(pageTitle) && !String.IsNullOrEmpty(pageContent))
                {
                    DWHDBHelper.AddPage(TaskName, pageKey, pageUrl, pageTitle, pageContent, DateTime.Now);
                }

                // Give derived classes a chance to rewrite the captured HTML.
                string pageContentForInsert = ProcessTargetHtmlContent(pageUrl, pageContent);

                // NOTE(review): this runs even when AddPage was skipped above (missing
                // title); confirm UpdatePage tolerates a page that was never added.
                if (pageContentForInsert != pageContent)
                {
                    DWHDBHelper.UpdatePage(TaskName, pageKey, pageUrl, pageTitle, pageContentForInsert, DateTime.Now);
                }

                // Only one content target element is expected per page.
                return;
            }
        }

        /// <summary>
        /// Hook for transforming the captured HTML before it is stored. The base
        /// implementation returns the content unchanged.
        /// </summary>
        /// <param name="pageUrl">URL of the page the content came from.</param>
        /// <param name="targetHtmlContent">Outer HTML of the matched target element.</param>
        /// <returns>The HTML to store; equal to the input when nothing was changed.</returns>
        public virtual string ProcessTargetHtmlContent(string pageUrl, string targetHtmlContent)
        {
            return targetHtmlContent;
        }

        /// <summary>
        /// Orders the page-list URLs in descending, case-insensitive order. The list
        /// is sorted in place and the same instance is returned.
        /// </summary>
        /// <param name="originalPageListUrls">URLs to order; modified in place.</param>
        /// <param name="currentPageListUrl">Unused by the base implementation.</param>
        /// <returns>The sorted <paramref name="originalPageListUrls"/> instance.</returns>
        protected virtual List<string> FilterPageListUrls(List<string> originalPageListUrls, string currentPageListUrl)
        {
            // Swapping the operands yields descending order without negating the result.
            originalPageListUrls.Sort((x, y) => String.Compare(y, x, true));
            return originalPageListUrls;
        }
    }
}
